
#########################################################################
############                GET CHILD DATASET               #############
#########################################################################

library(haven)
library(readr)
library(data.table)
library(scales)

# using TIRC (not XTIRC)

# If above code is too large to run, run this instead:
panel <- fread("H:/Zheng_10223/Joint/child_year_panel2025.csv", 
               select=c("MAIN_PARENT", "PARENT2", "PARENT1", "IMDB_ID", "Year", "LANDING_AGE", "BIRTH_YEAR", 
                        "Child_EmpInc_IND", "Child_EmpInc_HH", "Child_TIRC_IND", "Child_TIRC_HH", 
                        "Parent1_TIRC_HH", "Parent2_TIRC_HH", "Parent1_TIRC_IND","Parent2_TIRC_IND",
                        "MainParent_EmpInc_HH", "MainParent_TIRC_IND", "MainParent_TIRC_HH","Child_IMM_STAT","CHILD_EMIG_DATE",
                        "MainParent_IMM_STAT","MainParent_EMIG_DATE","Parent1_IMM_STAT","Parent1_EMIG_DATE",
                        "Parent2_IMM_STAT","Parent2_EMIG_DATE",
                        "Parent3_IMM_STAT","Parent3_EMIG_DATE",
                        "Parent4_IMM_STAT","Parent4_EMIG_DATE")
              )



# Drop children without parents:
panel <- panel[!(MAIN_PARENT=="")]

# Impute negative income to missing
panel[panel<0] <- NA

# Need to deflate to real income (2020$)
cpi <- read_dta("H:/Zheng_10223/Joint/cpi/cpi.dta")
panel <- merge(x=panel, y=cpi, by.x="Year", by.y="year", all.x=T)
panel[,c("Child_EmpInc_IND", "Child_TIRC_IND", "Child_TIRC_HH")] <- panel[,c("Child_EmpInc_IND", "Child_TIRC_IND", "Child_TIRC_HH")]*cpi$cpi[which(cpi$year==2020)]/panel$cpi
panel[,c( "Parent1_TIRC_IND", "Parent1_TIRC_HH")] <- panel[,c( "Parent1_TIRC_IND", "Parent1_TIRC_HH")]*cpi$cpi[which(cpi$year==2020)]/panel$cpi
panel[,c( "Parent2_TIRC_IND", "Parent2_TIRC_HH")] <- panel[,c( "Parent2_TIRC_IND", "Parent2_TIRC_HH")]*cpi$cpi[which(cpi$year==2020)]/panel$cpi
panel[,c( "MainParent_TIRC_IND", "MainParent_TIRC_HH")] <- panel[,c("MainParent_TIRC_IND", "MainParent_TIRC_HH")]*cpi$cpi[which(cpi$year==2020)]/panel$cpi

# Winsorize income at top 0.1% (among immigrant sample...)
panel$Child_EmpInc_IND[which(panel$Child_EmpInc_IND>=quantile(panel$Child_EmpInc_IND, 0.999, na.rm=T))] <- quantile(panel$Child_EmpInc_IND, 0.999, na.rm=T)
panel$Child_TIRC_IND[which(panel$Child_TIRC_IND>=quantile(panel$Child_TIRC_IND, 0.999, na.rm=T))] <- quantile(panel$Child_TIRC_IND, 0.999, na.rm=T)
panel$Child_TIRC_HH[which(panel$Child_TIRC_HH>=quantile(panel$Child_TIRC_HH, 0.999, na.rm=T))] <- quantile(panel$Child_TIRC_HH, 0.999, na.rm=T)
panel$MainParent_TIRC_HH[which(panel$MainParent_TIRC_HH>=quantile(panel$MainParent_TIRC_HH, 0.999, na.rm=T))] <- quantile(panel$MainParent_TIRC_HH, 0.999, na.rm=T)
panel$Parent1_TIRC_HH[which(panel$Parent1_TIRC_HH>=quantile(panel$Parent1_TIRC_HH, 0.999, na.rm=T))] <- quantile(panel$Parent1_TIRC_HH, 0.999, na.rm=T)
panel$Parent2_TIRC_HH[which(panel$Parent2_TIRC_HH>=quantile(panel$Parent2_TIRC_HH, 0.999, na.rm=T))] <- quantile(panel$Parent2_TIRC_HH, 0.999, na.rm=T)
panel$MainParent_TIRC_IND[which(panel$MainParent_TIRC_IND>=quantile(panel$MainParent_TIRC_IND, 0.999, na.rm=T))] <- quantile(panel$MainParent_TIRC_IND, 0.999, na.rm=T)
panel$Parent1_TIRC_IND[which(panel$Parent1_TIRC_IND>=quantile(panel$Parent1_TIRC_IND, 0.999, na.rm=T))] <- quantile(panel$Parent1_TIRC_IND, 0.999, na.rm=T)
panel$Parent2_TIRC_IND[which(panel$Parent2_TIRC_IND>=quantile(panel$Parent2_TIRC_IND, 0.999, na.rm=T))] <- quantile(panel$Parent2_TIRC_IND, 0.999, na.rm=T)

# Get basic landing information for main parent
landing <- read_dta("G:/IMDB_AllYears/rdc/IMDB_BDIM_2022_v1/data_donnees/stata/Core_IMDB/pnrf_1980_2022_f3_v1.dta")
landing <- data.table(landing)

# merge in parent landing info ( note: BIRTH_YEAR in "panel" is child birth year )
panel <- merge(x=panel, y=landing[,c("IMDB_ID", "LANDING_YEAR", "FIRST_TAX_YEAR", "LAST_TAX_YEAR", "DEATH_YEAR", "CITIZEN_YEAR","BIRTH_YEAR","gender")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)
setnames(panel, old=c("BIRTH_YEAR.x","BIRTH_YEAR.y","LANDING_YEAR", "FIRST_TAX_YEAR", "LAST_TAX_YEAR", "DEATH_YEAR", "CITIZEN_YEAR","gender"), new=c("BIRTH_YEAR_child","BirthYear_MainParent","LandingYear_MainParent", "FIRST_TAX_YEAR_MainParent", "LAST_TAX_YEAR_MainParent", "DEATH_YEAR_MainParent", "CITIZEN_YEAR_MainParent","gender_MainParent"))

# get basic landing information for parent1 and parent 2 including gender to get fathers
panel <- merge (x=panel, y=landing[,c("IMDB_ID", "LANDING_YEAR", "FIRST_TAX_YEAR", "LAST_TAX_YEAR", "DEATH_YEAR", "BIRTH_YEAR","gender")], by.x="PARENT1", by.y="IMDB_ID", all.x=T)
setnames(panel, old=c("BIRTH_YEAR","LANDING_YEAR", "FIRST_TAX_YEAR", "LAST_TAX_YEAR", "DEATH_YEAR","gender"), new=c("BirthYear_Parent1","LandingYear_PARENT1", "FIRST_TAX_YEAR_PARENT1", "LAST_TAX_YEAR_PARENT1", "DEATH_YEAR_PARENT1","gender_parent1"))

panel <- merge (x=panel, y=landing[,c("IMDB_ID", "LANDING_YEAR", "FIRST_TAX_YEAR", "LAST_TAX_YEAR", "DEATH_YEAR", "BIRTH_YEAR","gender")], by.x="PARENT2", by.y="IMDB_ID", all.x=T)
setnames(panel, old=c("BIRTH_YEAR","LANDING_YEAR", "FIRST_TAX_YEAR", "LAST_TAX_YEAR", "DEATH_YEAR","gender"), new=c("BirthYear_Parent2","LandingYear_PARENT2", "FIRST_TAX_YEAR_PARENT2", "LAST_TAX_YEAR_PARENT2", "DEATH_YEAR_PARENT2","gender_parent2"))


### Attempt to follow Chetty et al. 2014 where missing tax returns are imputed to zero
# Problem with imputing missing to zero is that a ton of children and parents never file a tax return
# Some children may have left the country (e.g., go to US for school)
# Solution: 
# i) Drop if child do not file AT LEAST ONE tax return between ages 30-34
# ii) Drop if parents do not file AT LEAST ONE tax returns within 10 years of landing
# This effectively means that any child or parent who is alive and filed at least one tax return between ages 30-34 OR within 10Y of landing, 
# will get missing tax returns imputed as zero. IF instead they do not fill a tax return following the above criteria, then they are dropped
panel[, `:=` (Child_AnyTax30_34=max(!is.na(Child_TIRC_HH[which((Year-BIRTH_YEAR_child) %in% c(30,31,32,33,34))])),
              Parent_AnyTaxPost10=max(!is.na(MainParent_TIRC_HH[which((Year-LandingYear_MainParent) %in% c(0:10))]))), by="IMDB_ID"]
panel <- panel[Child_AnyTax30_34==1 & Parent_AnyTaxPost10==1]

# Replace missing tax returns (provided between landing and last year) with zero
panel[, `:=` (MinDeathLeave_MainParent=pmin(DEATH_YEAR_MainParent, LAST_TAX_YEAR_MainParent, na.rm=T)), by="IMDB_ID"] # get the min between death year and last tax year
setnafill(panel, cols=c("MainParent_TIRC_HH", "Parent1_TIRC_HH", "Parent2_TIRC_HH"), fill=0)
panel[Year<FIRST_TAX_YEAR_MainParent | Year>MinDeathLeave_MainParent, c("MainParent_TIRC_HH", "Parent1_TIRC_HH", "Parent2_TIRC_HH")] <- NA

# Children: 
panel <- merge(x=panel, y=landing[,c("IMDB_ID", "DEATH_YEAR")], by.x="IMDB_ID", by.y="IMDB_ID", all.x=T)
setnames(panel, old=c("DEATH_YEAR"), new="DEATH_YEAR_Child")
# DO NOT FILL IN MISSINGS TO ZERO- that's the Connolly version; missings not included
#setnafill(panel, cols=c("Child_TIRC_HH", "Child_EmpInc_IND"), fill=0)
panel[Year>DEATH_YEAR_Child, c("Child_EmpInc_IND", "Child_TIRC_IND", "Child_TIRC_HH")] <- NA


# Spouse income
panel$Child_TIRC_spouse=panel$Child_TIRC_HH - panel$Child_TIRC_IND


# Aggregate to child-level dataset with relevant child and parent incomes
sample <- panel[, .(
  # Child average total household income at 30-34 (HERE WE REMOVE missings from the calculation)
  Child_Income_HH_30_34=mean(Child_TIRC_HH[which((Year-BIRTH_YEAR_child) %in% c(30,31,32,33,34))], na.rm=T),
  
  # Child average total individual income at 30-34
  Child_Income_IND_30_34=mean(Child_TIRC_IND[which((Year-BIRTH_YEAR_child) %in% c(30,31,32,33,34))], na.rm=T),
  
  # Child average individual earnings at 30-34
  Child_EmpInc_IND_30_34=mean(Child_EmpInc_IND[which((Year-BIRTH_YEAR_child) %in% c(30,31,32,33,34))], na.rm=T),
  
  
  # Child spouse income when child is 30-34
  Child_Income_Spouse_30_34=mean(Child_TIRC_spouse[which((Year-BIRTH_YEAR_child) %in% c(30,31,32,33,34))], na.rm=T),
  
  
  # Main parent household total income when child is 15-19
  MainParent_Income_HH_15_19=mean(MainParent_TIRC_HH[which((Year-BIRTH_YEAR_child) %in% c(15,16,17,18,19))], na.rm=T),
  
  # Main parent household total income when child is 10-19
  MainParent_Income_HH_10_19=mean(MainParent_TIRC_HH[which((Year-BIRTH_YEAR_child) %in% c(10,11,12,13,14,15,16,17,18,19))], na.rm=T),
  
  # Main parent age 45-49 (following Connolly et al., though note that they do mothers only)
  MainParent_Income_HH_MainParentAge45_49=mean(MainParent_TIRC_HH[which((Year-BirthYear_MainParent) %in% c(45,46,47,48,49))], na.rm=T),
  
  
  # Main parent age 45-49 IND (following Connolly et al., though note that they do mothers only)
  MainParent_Income_IND_MainParentAge45_49=mean(MainParent_TIRC_IND[which((Year-BirthYear_MainParent) %in% c(45,46,47,48,49))], na.rm=T),
  
  
  
  # Top earner parent household total income when Parent2 is 12-16
  Parent1_Income_HH_15_19=mean(Parent1_TIRC_HH[which((Year-BIRTH_YEAR_child) %in% c(15,16,17,18,19))], na.rm=T),
  Parent2_Income_HH_15_19=mean(Parent2_TIRC_HH[which((Year-BIRTH_YEAR_child) %in% c(15,16,17,18,19))], na.rm=T),
  
  
  # Top earner parent /father household total income when the top earner parent is 45-49
  Parent1_Income_HH_45_49=mean(Parent1_TIRC_HH[which((Year-BirthYear_Parent1) %in% c(45,46,47,48,49))], na.rm=T),
  Parent2_Income_HH_45_49=mean(Parent2_TIRC_HH[which((Year-BirthYear_Parent2) %in% c(45,46,47,48,49))], na.rm=T),
  
  
  # Top earner parent /father IND total income when the top earner parent is 45-49
  Parent1_Income_IND_45_49=mean(Parent1_TIRC_IND[which((Year-BirthYear_Parent1) %in% c(45,46,47,48,49))], na.rm=T),
  Parent2_Income_IND_45_49=mean(Parent2_TIRC_IND[which((Year-BirthYear_Parent2) %in% c(45,46,47,48,49))], na.rm=T),
  
  
  # Main parent household total income 5 and 10 years after landing
  MainParent_Income_HH_PostLanding5=mean(MainParent_TIRC_HH[which((Year-LandingYear_MainParent) %in% c(3,4,5,6,7))], na.rm=T),
  MainParent_Income_HH_PostLanding10=mean(MainParent_TIRC_HH[which((Year-LandingYear_MainParent) %in% c(8,9,10,11,12))], na.rm=T)
), by="IMDB_ID"]

  
# household Get top parent income and identity : use top parent definition between 45 -49 age
sample$TopParent_Income_HH_45_49 <- pmax(sample$Parent1_Income_HH_45_49, sample$Parent2_Income_HH_45_49)
sample$TopParent <- "PARENT1"
sample$TopParent[which(sample$TopParent_Income_HH_45_49==sample$Parent2_Income_HH_45_49)] <- "PARENT2"

# if no parent 1 or parent 2, then it's just the main parent
sample$TopParent[which(is.na(sample$TopParent_Income_HH_45_49))]="Main Parent"
sample$TopParent_Income_HH_45_49[which(is.na(sample$TopParent_Income_HH_45_49))]=sample$MainParent_Income_HH_MainParentAge45_49[which(is.na(sample$TopParent_Income_HH_45_49))]


# individual  Get top parent income and identity : use top parent definition between 45 -49 age
sample$TopParent_Income_IND_45_49 <- pmax(sample$Parent1_Income_IND_45_49, sample$Parent2_Income_IND_45_49)
sample$TopParent <- "PARENT1"
sample$TopParent[which(sample$TopParent_Income_IND_45_49==sample$Parent2_Income_IND_45_49)] <- "PARENT2"

# if no parent 1 or parent 2, then it's just the main parent
sample$TopParent[which(is.na(sample$TopParent_Income_IND_45_49))]="Main Parent"
sample$TopParent_Income_IND_45_49[which(is.na(sample$TopParent_Income_IND_45_49))]=sample$MainParent_Income_IND_MainParentAge45_49[which(is.na(sample$TopParent_Income_IND_45_49))]



### Get income percentile
child_landing <- read_dta("H:/Zheng_10223/Joint/child_pnrf_1980_2020_f3_v1.dta")
child_landing <- data.table(child_landing)
child_landing$MAIN_PARENT=child_landing$main_parent; child_landing$main_parent=NULL
child_landing$PARENT1=child_landing$Parent1; child_landing$Parent1=NULL
child_landing$PARENT2=child_landing$Parent2; child_landing$Parent2=NULL
child_landing$CHILD_LAST_TAX_YEAR=child_landing$child_last_tax_year

# Income percentiles from the LAD (at year-age-level)
income_pct <- read.csv("H:/Zheng_10223/Joint/LAD/finaldist.csv", header=TRUE, stringsAsFactors = FALSE)
income_pct <- merge(x=income_pct, y=cpi, by.x="year", by.y="year", all.x=T)
income_pct$pctindTIRC_real <- income_pct$pctindTIRC*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct$pcthhTIRC_real <- income_pct$pcthhTIRC*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct$pctindearn_real <- income_pct$pctindearn*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct$pctparentTIRC_real <- income_pct$pctparentTIRC*cpi$cpi[which(cpi$year==2020)]/income_pct$cpi
income_pct <- data.table(income_pct)
income_pct <- income_pct[order(year, age, pct)]

# Get child and main parent landing year and landing age
sample <- merge(x=sample, y=child_landing[,c("IMDB_ID", "BIRTH_YEAR", "LANDING_YEAR", "LANDING_AGE", "PARENT1", "PARENT2", "MAIN_PARENT")], by.x="IMDB_ID", by.y="IMDB_ID")
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "LANDING_YEAR")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T) #Main parent's landing year
setnames(sample, old=c("LANDING_YEAR.x", "LANDING_YEAR.y", "BIRTH_YEAR"), new=c("LandingYear_Child", "LandingYear_MainParent", "BirthYear_Child"))

# get main parent last tax year
sample=merge(x=sample, y=landing[,c("IMDB_ID","LAST_TAX_YEAR")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T) 
setnames(sample, old=c("LAST_TAX_YEAR"), new=c("LAST_TAX_YEAR_MAINPARENT"))

# get child last tax year 
sample=merge(x=sample, y=child_landing[,c("IMDB_ID","CHILD_LAST_TAX_YEAR")], by.x="IMDB_ID", by.y="IMDB_ID", all.x=T) 



# Get birth year for main parent, parent 1, and parent 2 (latter 2 needed for top parent income)
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "BIRTH_YEAR")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)  
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "BIRTH_YEAR")], by.x="PARENT1", by.y="IMDB_ID", all.x=T) 
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "BIRTH_YEAR")], by.x="PARENT2", by.y="IMDB_ID", all.x=T) 
setnames(sample, old=c("BIRTH_YEAR.x", "BIRTH_YEAR.y", "BIRTH_YEAR"), new=c("BirthYear_MainParent", "BirthYear_Parent1", "BirthYear_Parent2"))


# Get gender for main parent, parent 1, and parent 2 (latter 2 needed for father income)
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "gender")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)  
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "gender")], by.x="PARENT1", by.y="IMDB_ID", all.x=T) 
sample <- merge(x=sample, y=landing[,c("IMDB_ID", "gender")], by.x="PARENT2", by.y="IMDB_ID", all.x=T) 
setnames(sample, old=c("gender.x", "gender.y", "gender"), new=c("gender_MainParent", "gender_Parent1", "gender_Parent2"))

# Identify Father:
sample$Father=""
sample$Father[sample$gender_MainParent==1]="MAIN_PARENT"
sample$Father[sample$Father=="" & sample$gender_Parent2==1]="Parent2"
sample$Father[sample$Father=="" & sample$gender_Parent1==1]="Parent1"

sample$Father_Income_HH_45_49[sample$Father=="MAIN_PARENT"]=sample$MainParent_Income_HH_MainParentAge45_49[sample$Father=="MAIN_PARENT"]
sample$Father_Income_HH_45_49[sample$Father=="Parent1"]=sample$Parent1_Income_HH_45_49[sample$Father=="Parent1"]
sample$Father_Income_HH_45_49[sample$Father=="Parent2"]=sample$Parent2_Income_HH_45_49[sample$Father=="Parent2"]

# if no father, replace with main parent:
sample$Father_Income_HH_45_49[is.na(sample$Father_Income_HH_45_49)]=sample$MainParent_Income_HH_MainParentAge45_49[is.na(sample$Father_Income_HH_45_49)]


# Get relevant year and age needed for percentile at a year/age
sample$YearAt30 <- sample$BirthYear_Child+30; sample$YearAt10 <- sample$BirthYear_Child+10; sample$YearAt15 <- sample$BirthYear_Child+15; 
sample$MainParentYearat45 <- sample$BirthYear_MainParent+45
sample$MainParentAgeAt15 <- sample$YearAt15-sample$BirthYear_MainParent;
sample$MainParentAgeAt10 <- sample$YearAt10-sample$BirthYear_MainParent;
sample$Parent1AgeAt15 <- sample$YearAt15-sample$BirthYear_Parent1;
sample$Parent2AgeAt15 <- sample$YearAt15-sample$BirthYear_Parent2;


sample$Parent1Yearat45 <- sample$BirthYear_Parent1 + 45;
sample$Parent2Yearat45 <- sample$BirthYear_Parent2 + 45;


sample$TopParentYearAt45 <- NA
sample$TopParentYearAt45[which(sample$TopParent=="Main Parent")] <- sample$MainParentYearat45[which(sample$TopParent=="Main Parent")]

sample$TopParentYearAt45[which(sample$TopParent=="PARENT1")] <- sample$Parent1Yearat45[which(sample$TopParent=="PARENT1")]
sample$TopParentYearAt45[which(sample$TopParent=="PARENT2")] <- sample$Parent2Yearat45[which(sample$TopParent=="PARENT2")]
sample$YearPost10Parent <- sample$LandingYear_MainParent+10; sample$AgePost10Parent <- sample$YearPost10Parent-sample$BirthYear_MainParent
sample$YearPost5Parent <- sample$LandingYear_MainParent+5; sample$AgePost5Parent <- sample$YearPost5Parent-sample$BirthYear_MainParent

sample$FatherYearAt45=NA
sample$FatherYearAt45[which(sample$Father=="MAIN_PARENT")]<-sample$MainParentYearat45[which(sample$Father=="MAIN_PARENT")]
sample$FatherYearAt45[which(sample$Father=="Parent2")]<-sample$Parent2Yearat45[which(sample$Father=="Parent2")]

sample$FatherYearAt45[which(sample$Father=="Parent1")]<-sample$Parent1Yearat45[which(sample$Father=="Parent1")]


# Placeholders for income percentiles
sample$Child_Income_HH_30_34_pct <- NA 
sample$Child_Income_Spouse_30_34_pct <- NA 
sample$Child_Income_IND_30_34_pct <- NA 
sample$Child_EmpInc_IND_30_34_pct <- NA 
sample$MainParent_Income_HH_15_19_pct <- NA
sample$MainParent_Income_HH_10_19_pct<- NA
sample$TopParent_Income_HH_15_19_pct <- NA


sample$TopParent_Income_HH_45_49_pctparent <- NA
sample$TopParent_Income_IND_45_49_pct <- NA

sample$Father_Income_HH_45_49_pctparent <- NA


sample$MainParent_Income_HH_PostLanding5_pcthh <- NA # percentile in the household distribution
sample$MainParent_Income_HH_PostLanding10_pcthh <- NA # percentile in the household distribution
sample$MainParent_Income_HH_PostLanding5_pctparent <- NA # percentile in the parent distribution
sample$MainParent_Income_HH_PostLanding10_pctparent <- NA # percentile in the parent distribution
sample$MainParent_Income_HH_MainParentAge45_49_pctparent<- NA
sample$MainParent_Income_HH_MainParentAge45_49_pcthh<- NA

sample$MainParent_Income_IND_MainParentAge45_49_pct<- NA

# Get child income percentiles (using Canadian 30year old)
for(i in 1992:2019){
  
  # distribution of child household income in the HOUSEHOLD INCOME DISTRIBUTION
  sample$Child_Income_HH_30_34_pct[which(sample$YearAt30==i)] <- findInterval(sample$Child_Income_HH_30_34[which(sample$YearAt30==i)], vec=income_pct$pcthhTIRC_real[which(income_pct$year==i & income_pct$age==30)])
  sample$Child_Income_HH_30_34_pct[which(sample$YearAt30==i & sample$Child_Income_HH_30_34_pct==min(sample$Child_Income_HH_30_34_pct[which(sample$YearAt30==i)], na.rm=T))] <-0.5*(1+sample$Child_Income_HH_30_34_pct[which(sample$YearAt30==i & sample$Child_Income_HH_30_34_pct==min(sample$Child_Income_HH_30_34_pct[which(sample$YearAt30==i)], na.rm=T))])
  
  
  sample$Child_Income_Spouse_30_34_pct[which(sample$YearAt30==i)] <- findInterval(sample$Child_Income_Spouse_30_34[which(sample$YearAt30==i)], vec=income_pct$pctindTIRC_real[which(income_pct$year==i & income_pct$age==30)])
  sample$Child_Income_Spouse_30_34_pct[which(sample$YearAt30==i & sample$Child_Income_Spouse_30_34_pct==min(sample$Child_Income_Spouse_30_34_pct[which(sample$YearAt30==i)], na.rm=T))] <-0.5*(1+sample$Child_Income_Spouse_30_34_pct[which(sample$YearAt30==i & sample$Child_Income_Spouse_30_34_pct==min(sample$Child_Income_Spouse_30_34_pct[which(sample$YearAt30==i)], na.rm=T))])
  
  
  
  sample$Child_Income_IND_30_34_pct[which(sample$YearAt30==i)] <- findInterval(sample$Child_Income_IND_30_34[which(sample$YearAt30==i)], vec=income_pct$pctindTIRC_real[which(income_pct$year==i & income_pct$age==30)])
  sample$Child_Income_IND_30_34_pct[which(sample$YearAt30==i & sample$Child_Income_IND_30_34_pct==min(sample$Child_Income_IND_30_34_pct[which(sample$YearAt30==i)], na.rm=T))] <-0.5*(1+sample$Child_Income_IND_30_34_pct[which(sample$YearAt30==i & sample$Child_Income_IND_30_34_pct==min(sample$Child_Income_IND_30_34_pct[which(sample$YearAt30==i)], na.rm=T))])
  
  sample$Child_EmpInc_IND_30_34_pct[which(sample$YearAt30==i)] <- findInterval(sample$Child_EmpInc_IND_30_34[which(sample$YearAt30==i)], vec=income_pct$pctindearn_real[which(income_pct$year==i & income_pct$age==30)])
  sample$Child_EmpInc_IND_30_34_pct[which(sample$YearAt30==i & sample$Child_EmpInc_IND_30_34_pct==min(sample$Child_EmpInc_IND_30_34_pct[which(sample$YearAt30==i)], na.rm=T))] <-0.5*(1+sample$Child_EmpInc_IND_30_34_pct[which(sample$YearAt30==i & sample$Child_EmpInc_IND_30_34_pct==min(sample$Child_EmpInc_IND_30_34_pct[which(sample$YearAt30==i)], na.rm=T))])

  print(i)
}

# Get parent income percentiles (using Canadian percentiles of adults when child is age 16)
# Need to loop through year and age (e.g., year and age of parent when their child is 16, or 5 year after landing)

# LAD INCOME_PCT ONLY STARTS IN 1982 
for(j in 1982:2019){
  
  # Compare income with rank/percentile of average household income AMONG PARENTS
  

  sample$MainParent_Income_IND_MainParentAge45_49_pct[which(sample$MainParentYearat45==j)] <- findInterval(sample$MainParent_Income_IND_MainParentAge45_49[which(sample$MainParentYearat45==j)], vec=income_pct$pctindTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  sample$MainParent_Income_IND_MainParentAge45_49_pct[which(sample$MainParentYearat45==j  & sample$MainParent_Income_IND_MainParentAge45_49_pct==min(sample$MainParent_Income_IND_MainParentAge45_49_pct[which(sample$MainParentYearat45==j )], na.rm=T))] <-0.5*(1+sample$MainParent_Income_IND_MainParentAge45_49_pct[which(sample$MainParentYearat45==j & sample$MainParent_Income_IND_MainParentAge45_49_pct==min(sample$MainParent_Income_IND_MainParentAge45_49_pct[which(sample$MainParentYearat45==j )], na.rm=T))])
  
  
  sample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(sample$MainParentYearat45==j)] <- findInterval(sample$MainParent_Income_HH_MainParentAge45_49[which(sample$MainParentYearat45==j)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  sample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(sample$MainParentYearat45==j  & sample$MainParent_Income_HH_MainParentAge45_49_pctparent==min(sample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(sample$MainParentYearat45==j )], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(sample$MainParentYearat45==j & sample$MainParent_Income_HH_MainParentAge45_49_pctparent==min(sample$MainParent_Income_HH_MainParentAge45_49_pctparent[which(sample$MainParentYearat45==j )], na.rm=T))])
  
  # For robustness, compare income with rank/percentile of average household income AMONG PARENTS
  sample$MainParent_Income_HH_MainParentAge45_49_pcthh[which(sample$MainParentYearat45==j)] <- findInterval(sample$MainParent_Income_HH_MainParentAge45_49[which(sample$MainParentYearat45==j)], vec=income_pct$pcthhTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  sample$MainParent_Income_HH_MainParentAge45_49_pcthh[which(sample$MainParentYearat45==j  & sample$MainParent_Income_HH_MainParentAge45_49_pcthh==min(sample$MainParent_Income_HH_MainParentAge45_49_pcthh[which(sample$MainParentYearat45==j )], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_MainParentAge45_49_pcthh[which(sample$MainParentYearat45==j & sample$MainParent_Income_HH_MainParentAge45_49_pcthh==min(sample$MainParent_Income_HH_MainParentAge45_49_pcthh[which(sample$MainParentYearat45==j )], na.rm=T))])

  
  # fathers
  sample$Father_Income_HH_45_49_pctparent[which(sample$FatherYearAt45==j)] <- findInterval(sample$Father_Income_HH_45_49[which(sample$FatherYearAt45==j)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  sample$Father_Income_HH_45_49_pctparent[which(sample$FatherYearAt45==j  & sample$Father_Income_HH_45_49_pctparent==min(sample$Father_Income_HH_45_49_pctparent[which(sample$FatherYearAt45==j )], na.rm=T))] <-0.5*(1+sample$Father_Income_HH_45_49_pctparent[which(sample$FatherYearAt45==j  & sample$Father_Income_HH_45_49_pctparent==min(sample$Father_Income_HH_45_49_pctparent[which(sample$FatherYearAt45==j )], na.rm=T))])

  
  # top parent
  sample$TopParent_Income_HH_45_49_pctparent[which(sample$TopParentYearAt45==j)] <- findInterval(sample$TopParent_Income_HH_45_49[which(sample$TopParentYearAt45==j)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  sample$TopParent_Income_HH_45_49_pctparent[which(sample$TopParentYearAt45==j  & sample$TopParent_Income_HH_45_49_pctparent==min(sample$TopParent_Income_HH_45_49_pctparent[which(sample$TopParentYearAt45==j )], na.rm=T))] <-0.5*(1+sample$TopParent_Income_HH_45_49_pctparent[which(sample$TopParentYearAt45==j & sample$TopParent_Income_HH_45_49_pctparent==min(sample$TopParent_Income_HH_45_49_pctparent[which(sample$TopParentYearAt45==j )], na.rm=T))])
  
  # top parent IND
  sample$TopParent_Income_IND_45_49_pct[which(sample$TopParentYearAt45==j)] <- findInterval(sample$TopParent_Income_IND_45_49[which(sample$TopParentYearAt45==j)], vec=income_pct$pctTIRC_real[which(income_pct$year==j & income_pct$age==45)])
  sample$TopParent_Income_IND_45_49_pct[which(sample$TopParentYearAt45==j  & sample$TopParent_Income_IND_45_49_pct==min(sample$TopParent_Income_IND_45_49_pct[which(sample$TopParentYearAt45==j )], na.rm=T))] <-0.5*(1+sample$TopParent_Income_IND_45_49_pct[which(sample$TopParentYearAt45==j & sample$TopParent_Income_IND_45_49_pct==min(sample$TopParent_Income_IND_45_49_pct[which(sample$TopParentYearAt45==j )], na.rm=T))])
  
  for(k in 18:80){
    
    # chetty et al 2014 uses parent income distribution, thus default is to compare against average household income AMONG PARENTS
    sample$MainParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$MainParentAgeAt15==k)] <- findInterval(sample$MainParent_Income_HH_15_19[which(sample$YearAt15==j & sample$MainParentAgeAt15==k)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==k)])
    sample$MainParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$MainParentAgeAt15==k & sample$MainParent_Income_HH_15_19_pct==min(sample$MainParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$MainParentAgeAt15==k)], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$MainParentAgeAt15==k & sample$MainParent_Income_HH_15_19_pct==min(sample$MainParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$MainParentAgeAt15==k)], na.rm=T))])
    
    sample$MainParent_Income_HH_10_19_pct[which(sample$YearAt10==j & sample$MainParentAgeAt10==k)] <- findInterval(sample$MainParent_Income_HH_10_19[which(sample$YearAt10==j & sample$MainParentAgeAt10==k)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==k)])
    sample$MainParent_Income_HH_10_19_pct[which(sample$YearAt10==j & sample$MainParentAgeAt10==k & sample$MainParent_Income_HH_10_19_pct==min(sample$MainParent_Income_HH_10_19_pct[which(sample$YearAt10==j & sample$MainParentAgeAt10==k)], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_10_19_pct[which(sample$YearAt10==j & sample$MainParentAgeAt10==k & sample$MainParent_Income_HH_10_19_pct==min(sample$MainParent_Income_HH_10_19_pct[which(sample$YearAt10==j & sample$MainParentAgeAt10==k)], na.rm=T))])
    
    
    sample$TopParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$TopParentAgeAt15==k)] <- findInterval(sample$TopParent_Income_HH_15_19[which(sample$YearAt15==j & sample$TopParentAgeAt15==k)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==k)])
    sample$TopParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$TopParentAgeAt15==k & sample$TopParent_Income_HH_15_19_pct==min(sample$MainParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$TopParentAgeAt15==k)], na.rm=T))] <-0.5*(1+sample$TopParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$TopParentAgeAt15==k & sample$TopParent_Income_HH_15_19_pct==min(sample$TopParent_Income_HH_15_19_pct[which(sample$YearAt15==j & sample$TopParentAgeAt15==k)], na.rm=T))])

    # Compare post landing income with rank/percentile of average household income AMONG PARENTS
    sample$MainParent_Income_HH_PostLanding5_pctparent[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)] <- findInterval(sample$MainParent_Income_HH_PostLanding5[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==k)])
    sample$MainParent_Income_HH_PostLanding5_pctparent[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k & sample$MainParent_Income_HH_PostLanding5_pctparent==min(sample$MainParent_Income_HH_PostLanding5_pctparent[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_PostLanding5_pctparent[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k & sample$MainParent_Income_HH_PostLanding5_pctparent==min(sample$MainParent_Income_HH_PostLanding5_pctparent[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)], na.rm=T))])
    
    sample$MainParent_Income_HH_PostLanding10_pctparent[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)] <- findInterval(sample$MainParent_Income_HH_PostLanding10[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)], vec=income_pct$pctparentTIRC_real[which(income_pct$year==j & income_pct$age==k)])
    sample$MainParent_Income_HH_PostLanding10_pctparent[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k & sample$MainParent_Income_HH_PostLanding10_pctparent==min(sample$MainParent_Income_HH_PostLanding10_pctparent[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_PostLanding10_pctparent[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k & sample$MainParent_Income_HH_PostLanding10_pctparent==min(sample$MainParent_Income_HH_PostLanding10_pctparent[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)], na.rm=T))])
  
    # For robustness try comparing against rank/percentile of average household income AMONG ALL HOUSEHOLDS   
    sample$MainParent_Income_HH_PostLanding5_pcthh[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)] <- findInterval(sample$MainParent_Income_HH_PostLanding5[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)], vec=income_pct$pcthhTIRC_real[which(income_pct$year==j & income_pct$age==k)])
    sample$MainParent_Income_HH_PostLanding5_pcthh[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k & sample$MainParent_Income_HH_PostLanding5_pcthh==min(sample$MainParent_Income_HH_PostLanding5_pcthh[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_PostLanding5_pcthh[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k & sample$MainParent_Income_HH_PostLanding5_pcthh==min(sample$MainParent_Income_HH_PostLanding5_pcthh[which(sample$YearPost5Parent==j & sample$AgePost5Parent==k)], na.rm=T))])
    
    sample$MainParent_Income_HH_PostLanding10_pcthh[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)] <- findInterval(sample$MainParent_Income_HH_PostLanding10[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)], vec=income_pct$pcthhTIRC_real[which(income_pct$year==j & income_pct$age==k)])
    sample$MainParent_Income_HH_PostLanding10_pcthh[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k & sample$MainParent_Income_HH_PostLanding10_pcthh==min(sample$MainParent_Income_HH_PostLanding10_pcthh[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)], na.rm=T))] <-0.5*(1+sample$MainParent_Income_HH_PostLanding10_pcthh[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k & sample$MainParent_Income_HH_PostLanding10_pcthh==min(sample$MainParent_Income_HH_PostLanding10_pcthh[which(sample$YearPost10Parent==j & sample$AgePost10Parent==k)], na.rm=T))])
  }
  print(j)
}


fwrite(sample, "H:/Zheng_10223/Joint/IGM_dataset_connollyversion_2025.csv")


#### Summary statistics
## This file takes the "IGM_Dataset" file from 0_CleanData.R and merges in the 
# landing information 

# OUTPUT: this file creates the "cohort.csv" file 


library(haven)
library(readr)
library(data.table)
library(scales)
library(lfe)
library(stargazer)

sample <- fread("H:/Zheng_10223/Joint/IGM_dataset_connollyversion_2025.csv")
length(unique(sample$IMDB_ID))
cohort <- sample[!is.na(sample$MainParent_Income_HH_MainParentAge45_49) & !is.na(sample$Child_Income_IND_30_34_pct),]

length(unique(cohort$IMDB_ID))
child_landing <- read_dta("H:/Zheng_10223/Joint/child_pnrf_1980_2020_f3_v1.dta")
child_landing <- data.table(child_landing)
child_landing$PARENT3=child_landing$Parent3; child_landing$Parent3=NULL
child_landing$PARENT4=child_landing$Parent4; child_landing$Parent3=NULL

cohort <- merge(x=cohort, y=child_landing[,c("IMDB_ID", "PARENT3", "PARENT4")], by.x="IMDB_ID", by.y="IMDB_ID")


landing <- read_dta("G:/IMDB_AllYears/rdc/IMDB_BDIM_2022_v1/data_donnees/stata/Core_IMDB/pnrf_1980_2022_f3_v1.dta")
landing <- data.table(landing)


# replace years of schooling 99 value not stated with 10 (so it goes in the less than 12 schooling bin)
landing$YEARS_OF_SCHOOLING[landing$YEARS_OF_SCHOOLING==99]=10

# Country of residence, birth, citizenship
country_crosswalk <- read_csv("H:/Zheng_10223/Joint/countryfiles/country_crosswalk.csv")
landing$COUNTRY_RESIDENCE <- as.numeric(landing$COUNTRY_RESIDENCE)
landing <- merge(x=landing, y=country_crosswalk, by.x="COUNTRY_RESIDENCE", by.y="Code")
setnames(landing, old=c("Country"), new=c("CountryOfLastResidence"))

cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "WORLD_AREA_BIRTH", "CountryOfLastResidence", "DESTINATION_CMA")], by.x="IMDB_ID", by.y="IMDB_ID")
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "YEARS_OF_SCHOOLING","EDUCATION_QUALIFICATION" ,"LANDING_YEAR")], by.x="PARENT1", by.y="IMDB_ID", all.x=T)
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "YEARS_OF_SCHOOLING","EDUCATION_QUALIFICATION",  "LANDING_YEAR")], by.x="PARENT2", by.y="IMDB_ID", all.x=T)
setnames(cohort, old=c("YEARS_OF_SCHOOLING.x", "YEARS_OF_SCHOOLING.y",  "LANDING_YEAR.x", "LANDING_YEAR.y","EDUCATION_QUALIFICATION.x","EDUCATION_QUALIFICATION.y"), new=c("YEARS_OF_SCHOOLING_PARENT1", "YEARS_OF_SCHOOLING_PARENT2", "Landing_Year1", "Landing_Year2","EDUCATION_QUALIFICATION1","EDUCATION_QUALIFICATION2"))
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "YEARS_OF_SCHOOLING","EDUCATION_QUALIFICATION")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)
setnames(cohort, old=c("YEARS_OF_SCHOOLING","EDUCATION_QUALIFICATION"), new=c("YEARS_OF_SCHOOLING_MainParent","EDUCATION_QUALIFICATION_Main"))


### Categorize immigration class using main parent, parent 1, 2 and child
# Note that even using up to four individuals, some children are always spouse/dependent
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "IMMIGRATION_CATEGORY_CENSUS")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "IMMIGRATION_CATEGORY_CENSUS")], by.x="PARENT1", by.y="IMDB_ID", all.x=T)
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "IMMIGRATION_CATEGORY_CENSUS")], by.x="PARENT2", by.y="IMDB_ID", all.x=T)
setnames(cohort, old=c("IMMIGRATION_CATEGORY_CENSUS.x", "IMMIGRATION_CATEGORY_CENSUS.y", "IMMIGRATION_CATEGORY_CENSUS"), new=c("Immigrantcategory_main", "Immigrantcategory1", "Immigrantcategory2"))

cohort <- merge(x=cohort, y=child_landing[,c("IMDB_ID", "IMMIGRATION_CATEGORY_CENSUS")], by.x="IMDB_ID", by.y="IMDB_ID")
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "IMMIGRATION_CATEGORY_CENSUS")], by.x="PARENT3", by.y="IMDB_ID", all.x=T)
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "IMMIGRATION_CATEGORY_CENSUS")], by.x="PARENT4", by.y="IMDB_ID", all.x=T)
setnames(cohort, old=c("IMMIGRATION_CATEGORY_CENSUS.x", "IMMIGRATION_CATEGORY_CENSUS.y", "IMMIGRATION_CATEGORY_CENSUS"), new=c("Immigrantcategory_child", "Immigrantcategory3", "Immigrantcategory4"))

# marital status for the child 
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "FAMILY_STATUS", "FAMILY_STATUS_ROLLUP", "MARITAL_STATUS")], by.x="IMDB_ID", by.y="IMDB_ID", all.x=T)

cohort <- cohort[!FAMILY_STATUS %in% c(2,5) & MARITAL_STATUS %in% c(3,9)]



# last tax years and death for the child:   can do better with emigrant status
#cohort <- merge (x=cohort, y=landing[,c("IMDB_ID",)])



# marital status for the parent 
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID",  "MARITAL_STATUS","MARITAL_STATUS_ROLLUP")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)
setnames(cohort, old=c("MARITAL_STATUS.x","MARITAL_STATUS.y","MARITAL_STATUS_ROLLUP"), new=c("Marital_status_child","Marital_status_main","MARITAL_STATUS_ROLLUP_main"))




cohort$ImmigrationCategory <- NA
cohort$ImmigrationCategory[which(cohort$Immigrantcategory_main %in% c('A1111', 'A1115', 'A1120', 'A1130', 'A1141', 'A1142', 'A1150', 'A1160', 'A1231', 'A1235', 'A1300')|
                                   cohort$Immigrantcategory1 %in% c('A1111', 'A1115', 'A1120', 'A1130', 'A1141', 'A1142', 'A1150', 'A1160', 'A1231', 'A1235', 'A1300')|
                                   cohort$Immigrantcategory2 %in% c('A1111', 'A1115', 'A1120', 'A1130', 'A1141', 'A1142', 'A1150', 'A1160', 'A1231', 'A1235', 'A1300')|
                                   cohort$Immigrantcategory_child %in% c('A1111', 'A1115', 'A1120', 'A1130', 'A1141', 'A1142', 'A1150', 'A1160', 'A1231', 'A1235', 'A1300'))] <- "Economic"

cohort$ImmigrationCategory[which(cohort$Immigrantcategory_main %in% c('A1211', 'A1212', 'A1215', 'A1221', 'A1225')|
                                   cohort$Immigrantcategory1 %in% c('A1211', 'A1212', 'A1215', 'A1221', 'A1225')|
                                   cohort$Immigrantcategory2 %in% c('A1211', 'A1212', 'A1215', 'A1221', 'A1225')|
                                   cohort$Immigrantcategory_child %in% c('A1211', 'A1212', 'A1215', 'A1221', 'A1225'))] <- "Business"

cohort$ImmigrationCategory[which(cohort$Immigrantcategory_main %in% c('C3110', 'C3120', 'C3210', 'C3220', 'C3230', 'D4110', 'D4120', 'D4210', 'Z9991')|
                                   cohort$Immigrantcategory1 %in% c('C3110', 'C3120', 'C3210', 'C3220', 'C3230', 'D4110', 'D4120', 'D4210', 'Z9991')|
                                   cohort$Immigrantcategory2 %in% c('C3110', 'C3120', 'C3210', 'C3220', 'C3230', 'D4110', 'D4120', 'D4210', 'Z9991')|
                                   cohort$Immigrantcategory_child %in% c('C3110', 'C3120', 'C3210', 'C3220', 'C3230', 'D4110', 'D4120', 'D4210', 'Z9991'))]<- "Refugee"

cohort$ImmigrationCategory[which(is.na(cohort$ImmigrationCategory))] <- "Missing"

#cohort <- merge(x=cohort, y=landing[,c("IMDB_ID",)], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "gender")], by.x="IMDB_ID", by.y="IMDB_ID", all.x=T)
setnames(cohort, old=c("gender"), new=c("Gender_Child"))

cohort$FemaleChild <- (cohort$Gender_Child==2)
cohort$FemaleChild[which(!cohort$Gender_Child  %in% c(1,2))] <- NA
cohort$Status_Economic <- (cohort$ImmigrationCategory=="Economic")
cohort$Status_Business <- (cohort$ImmigrationCategory=="Business")
cohort$Status_Refugee <- (cohort$ImmigrationCategory=="Refugee")
cohort$Status_Unknown <- (cohort$ImmigrationCategory=="Missing")

cohort$WORLD_AREA_BIRTH <- as.character(cohort$WORLD_AREA_BIRTH)
cohort$WORLD_AREA_BIRTH[which(cohort$WORLD_AREA_BIRTH=="1")] <- "Europe"; cohort$Europe <- cohort$WORLD_AREA_BIRTH=="Europe"
cohort$WORLD_AREA_BIRTH[which(cohort$WORLD_AREA_BIRTH=="2")] <- "Africa and Middle East"; cohort$AfricaMiddleEast <- cohort$WORLD_AREA_BIRTH=="Africa and Middle East"
cohort$WORLD_AREA_BIRTH[which(cohort$WORLD_AREA_BIRTH=="3")] <- "Southern Asia"; cohort$SouthernAsia <- cohort$WORLD_AREA_BIRTH=="Southern Asia"
cohort$WORLD_AREA_BIRTH[which(cohort$WORLD_AREA_BIRTH=="4")] <- "Eastern Asia"; cohort$EasternAsia <- cohort$WORLD_AREA_BIRTH=="Eastern Asia"
cohort$WORLD_AREA_BIRTH[which(cohort$WORLD_AREA_BIRTH=="5")] <- "Oceania and other Asia"; cohort$Oceania <- cohort$WORLD_AREA_BIRTH=="Oceania and other Asia"
cohort$WORLD_AREA_BIRTH[which(cohort$WORLD_AREA_BIRTH=="6")] <- "South and Central America"; cohort$CentralSouthAmerica <- cohort$WORLD_AREA_BIRTH=="South and Central America"
cohort$WORLD_AREA_BIRTH[which(cohort$WORLD_AREA_BIRTH=="7")] <- "US or other"; cohort$USOrOther <- cohort$WORLD_AREA_BIRTH=="US or other"

cohort$DESTINATION_CMA <- as.character(cohort$DESTINATION_CMA)
cohort$DESTINATION_CMA[which(cohort$DESTINATION_CMA=="35535")] <- "Toronto"; cohort$Toronto <- cohort$DESTINATION_CMA=="Toronto"
cohort$DESTINATION_CMA[which(cohort$DESTINATION_CMA=="24462")] <- "Montreal"; cohort$Montreal <- cohort$DESTINATION_CMA=="Montreal"
cohort$DESTINATION_CMA[which(cohort$DESTINATION_CMA=="59933")] <- "Vancouver"; cohort$Vancouver <- cohort$DESTINATION_CMA=="Vancouver"
cohort$DESTINATION_CMA[which(cohort$DESTINATION_CMA=="48825")] <- "Calgary"; cohort$Calgary <- cohort$DESTINATION_CMA=="Calgary"
cohort$DESTINATION_CMA[which(cohort$DESTINATION_CMA=="48835")] <- "Edmonton"; cohort$Edmonton <- cohort$DESTINATION_CMA=="Edmonton"


# Max year of schooling among two parents
# cohort$YEARS_OF_SCHOOLING_PARENT1[which(cohort$YEARS_OF_SCHOOLING_PARENT1==99)] <- NA; cohort$YEARS_OF_SCHOOLING_PARENT1[which(cohort$YEARS_OF_SCHOOLING_PARENT1<0 | cohort$YEARS_OF_SCHOOLING_PARENT1>23)] <- NA; 
# cohort$YEARS_OF_SCHOOLING_PARENT2[which(cohort$YEARS_OF_SCHOOLING_PARENT2==99)] <- NA; cohort$YEARS_OF_SCHOOLING_PARENT2[which(cohort$YEARS_OF_SCHOOLING_PARENT2<0 | cohort$YEARS_OF_SCHOOLING_PARENT2>23)] <- NA; 
# cohort$YEARS_OF_SCHOOLING_PARENT_MAX=pmax(cohort$YEARS_OF_SCHOOLING_PARENT1, cohort$YEARS_OF_SCHOOLING_PARENT2, na.rm=T)
# cohort$SchoolingBins_max <- NA
# cohort$SchoolingBins_max[which(cohort$YEARS_OF_SCHOOLING_PARENT_MAX<12)] <- "<12"
# cohort$SchoolingBins_max[which(cohort$YEARS_OF_SCHOOLING_PARENT_MAX==12)] <- "12"
# cohort$SchoolingBins_max[which(cohort$YEARS_OF_SCHOOLING_PARENT_MAX>12 & cohort$YEARS_OF_SCHOOLING_PARENT_MAX<=15)] <- "12-15"
# cohort$SchoolingBins_max[which(cohort$YEARS_OF_SCHOOLING_PARENT_MAX==16)] <- "16"
# cohort$SchoolingBins_max[which(cohort$YEARS_OF_SCHOOLING_PARENT_MAX>16 & cohort$YEARS_OF_SCHOOLING_PARENT_MAX<99)] <- ">16"
# cohort$LTHS <- (cohort$SchoolingBins=="<12")
# cohort$HSGrad <- (cohort$SchoolingBins=="12")
# cohort$SomeCollege <- (cohort$SchoolingBins=="12-15")
# cohort$College <- (cohort$SchoolingBins=="16")
# cohort$CollegePlus <- (cohort$SchoolingBins==">16")
# 
# # English ability of main parent
# cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "ENGLISH_IND_PRIOR2019", "ENGLISH_IND", "FRENCH_IND_PRIOR2019", "FRENCH_IND", "OFFICIAL_LANGUAGE")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)
# 
# setnames(cohort, old=c("ENGLISH_IND_PRIOR2019", "ENGLISH_IND", "FRENCH_IND_PRIOR2019", "FRENCH_IND", "OFFICIAL_LANGUAGE"), new=c("ENGLISH_IND_PRIOR2019_Main", "ENGLISH_IND_Main", "FRENCH_IND_PRIOR2019_Main", "FRENCH_IND_Main", "OFFICIAL_LANGUAGE_Main"))
# 
# 
# cohort$EnglishMotherTongue_Main <- (cohort$ENGLISH_IND_Main==1|cohort$ENGLISH_IND_PRIOR2019_Main==1)
# cohort$AnyEnglish_Main <- (cohort$OFFICIAL_LANGUAGE_Main %in% c(1,3) | cohort$ENGLISH_IND_Main==1|cohort$ENGLISH_IND_PRIOR2019_Main==1)
# cohort$FrenchMotherTongue_Main <- (cohort$FRENCH_IND_Main==1|cohort$FRENCH_IND_PRIOR2019_Main==1)
# cohort$AnyFrench_Main <- (cohort$OFFICIAL_LANGUAGE_Main %in% c(2,3) | cohort$FRENCH_IND_Main==1|cohort$FRENCH_IND_PRIOR2019_Main==1)


# Language for Parent 1
# 
# cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "ENGLISH_IND_PRIOR2019", "ENGLISH_IND", "FRENCH_IND_PRIOR2019", "FRENCH_IND", "OFFICIAL_LANGUAGE")], by.x="PARENT1", by.y="IMDB_ID", all.x=T)
# setnames(cohort, old=c("ENGLISH_IND_PRIOR2019", "ENGLISH_IND", "FRENCH_IND_PRIOR2019", "FRENCH_IND", "OFFICIAL_LANGUAGE"), new=c("ENGLISH_IND_PRIOR2019_Parent1", "ENGLISH_IND_Parent1", "FRENCH_IND_PRIOR2019_Parent1", "FRENCH_IND_Parent1", "OFFICIAL_LANGUAGE_Parent1"))
# 
# 
# 
# cohort$EnglishMotherTongue_Parent1 <- (cohort$ENGLISH_IND_Parent1==1|cohort$ENGLISH_IND_PRIOR2019_Parent1==1)
# cohort$AnyEnglish_Parent1 <- (cohort$OFFICIAL_LANGUAGE_Parent1 %in% c(1,3) | cohort$ENGLISH_IND_Parent1==1|cohort$ENGLISH_IND_PRIOR2019_Parent1==1)
# cohort$FrenchMotherTongue_Parent1 <- (cohort$FRENCH_IND_Parent1==1|cohort$FRENCH_IND_PRIOR2019_Parent1==1)
# cohort$AnyFrench_Parent1 <- (cohort$OFFICIAL_LANGUAGE_Parent1 %in% c(2,3) | cohort$FRENCH_IND_Parent1==1|cohort$FRENCH_IND_PRIOR2019_Parent1==1)


# Language for Parent 2 

# 
# 
# cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "ENGLISH_IND_PRIOR2019", "ENGLISH_IND", "FRENCH_IND_PRIOR2019", "FRENCH_IND", "OFFICIAL_LANGUAGE")], by.x="PARENT2", by.y="IMDB_ID", all.x=T)
# setnames(cohort, old=c("ENGLISH_IND_PRIOR2019", "ENGLISH_IND", "FRENCH_IND_PRIOR2019", "FRENCH_IND", "OFFICIAL_LANGUAGE"), new=c("ENGLISH_IND_PRIOR2019_Parent2", "ENGLISH_IND_Parent2", "FRENCH_IND_PRIOR2019_Parent2", "FRENCH_IND_Parent2", "OFFICIAL_LANGUAGE_Parent2"))
# 
# 
# 
# cohort$EnglishMotherTongue_Parent2 <- (cohort$ENGLISH_IND_Parent2==1|cohort$ENGLISH_IND_PRIOR2019_Parent2==1)
# cohort$AnyEnglish_Parent2 <- (cohort$OFFICIAL_LANGUAGE_Parent2 %in% c(1,3) | cohort$ENGLISH_IND_Parent2==1|cohort$ENGLISH_IND_PRIOR2019_Parent2==1)
# cohort$FrenchMotherTongue_Parent2 <- (cohort$FRENCH_IND_Parent2==1|cohort$FRENCH_IND_PRIOR2019_Parent2==1)
# cohort$AnyFrench_Parent2 <- (cohort$OFFICIAL_LANGUAGE_Parent2 %in% c(2,3) | cohort$FRENCH_IND_Parent2==1|cohort$FRENCH_IND_PRIOR2019_Parent2==1)
# 
# 
# 
# # INTENDED OCCUPATION AND SKILL FOR mAIN 
# cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "NOC2_CD11","NOC3_CD11","SKILL_LEVEL_CD11")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)
# setnames(cohort,old=c("NOC2_CD11","NOC3_CD11","SKILL_LEVEL_CD11"), new=c("NOC2_CD11_Main","NOC3_CD11_Main","SKILL_LEVEL_CD11_Main"))
# 
# 
# 
# 
# 
# # INTENDED OCCUPATION AND SKILL FOR Parent 1 
# cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "NOC2_CD11","NOC3_CD11","SKILL_LEVEL_CD11")], by.x="PARENT1", by.y="IMDB_ID", all.x=T)
# setnames(cohort,old=c("NOC2_CD11","NOC3_CD11","SKILL_LEVEL_CD11"), new=c("NOC2_CD11_Parent1","NOC3_CD11_Parent1","SKILL_LEVEL_CD11_Parent1"))
# 
# # INTENDED OCCUPATION AND SKILL FOR Parent 2 
# cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "NOC2_CD11","NOC3_CD11","SKILL_LEVEL_CD11")], by.x="PARENT2", by.y="IMDB_ID", all.x=T)
# setnames(cohort,old=c("NOC2_CD11","NOC3_CD11","SKILL_LEVEL_CD11"), new=c("NOC2_CD11_Parent2","NOC3_CD11_Parent2","SKILL_LEVEL_CD11_Parent2"))



######## Language, Schooling, Skill for Father and Top Parent: (not needed for connolly analysis)

# Language 
# cohort$AnyEnglish_Father=NA
# cohort$AnyEnglish_Father[cohort$Father=="Parent1"]=cohort$AnyEnglish_Parent1[cohort$Father=="Parent1"]
# cohort$AnyEnglish_Father[cohort$Father=="Parent2"]=cohort$AnyEnglish_Parent2[cohort$Father=="Parent2"]
# cohort$AnyEnglish_Father[cohort$Father=="MAIN_PARENT"]=cohort$AnyEnglish_Main[cohort$Father=="MAIN_PARENT"]
# 
# cohort$AnyFrench_Father=NA
# cohort$AnyFrench_Father[cohort$Father=="Parent1"]=cohort$AnyFrench_Parent1[cohort$Father=="Parent1"]
# cohort$AnyFrench_Father[cohort$Father=="Parent2"]=cohort$AnyFrench_Parent2[cohort$Father=="Parent2"]
# cohort$AnyFrench_Father[cohort$Father=="MAIN_PARENT"]=cohort$AnyFrench_Main[cohort$Father=="MAIN_PARENT"]
# 
# cohort$AnyEnglish_Top=NA
# cohort$AnyFrench_Top=NA
# cohort$AnyEnglish_Top[cohort$TopParent=="Parent1"]=cohort$AnyEnglish_Parent1[cohort$TopParent=="Parent1"]
# cohort$AnyEnglish_Top[cohort$TopParent=="Parent2"]=cohort$AnyEnglish_Parent2[cohort$TopParent=="Parent2"]
# cohort$AnyEnglish_Top[cohort$TopParent=="MAIN_PARENT"]=cohort$AnyEnglish_Main[cohort$TopParent=="MAIN_PARENT"]
# 
# cohort$AnyFrench_Top[cohort$TopParent=="Parent1"]=cohort$AnyFrench_Parent1[cohort$TopParent=="Parent1"]
# cohort$AnyFrench_Top[cohort$TopParent=="Parent2"]=cohort$AnyFrench_Parent2[cohort$TopParent=="Parent2"]
# cohort$AnyFrench_Top[cohort$TopParent=="MAIN_PARENT"]=cohort$AnyFrench_Main[cohort$TopParent=="MAIN_PARENT"]
# 


# Schooling 
# cohort$SchoolingBins_Main <- NA
# cohort$SchoolingBins_Main[which(cohort$YEARS_OF_SCHOOLING_MainParent<12)] <- "<12"
# cohort$SchoolingBins_Main[which(cohort$YEARS_OF_SCHOOLING_MainParent==12)] <- "12"
# cohort$SchoolingBins_Main[which(cohort$YEARS_OF_SCHOOLING_MainParent>12 & cohort$YEARS_OF_SCHOOLING_MainParent<=15)] <- "12-15"
# cohort$SchoolingBins_Main[which(cohort$YEARS_OF_SCHOOLING_MainParent==16)] <- "16"
# cohort$SchoolingBins_Main[which(cohort$YEARS_OF_SCHOOLING_MainParent>16 & cohort$YEARS_OF_SCHOOLING_MainParent<99)] <- ">16"
# 
# cohort$schooling_Father=NA
# cohort$schooling_Father[cohort$Father=="Parent1"]=cohort$YEARS_OF_SCHOOLING_PARENT1[cohort$Father=="Parent1"]
# cohort$schooling_Father[cohort$Father=="Parent2"]=cohort$YEARS_OF_SCHOOLING_PARENT2[cohort$Father=="Parent2"]
# cohort$schooling_Father[cohort$Father=="MAIN_PARENT"]=cohort$YEARS_OF_SCHOOLING_MainParent[cohort$Father=="MAIN_PARENT"]
# 
# cohort$educqual_Father=NA
# cohort$educqual_Father[cohort$Father=="Parent1"]=cohort$EDUCATION_QUALIFICATION1[cohort$Father=="Parent1"]
# cohort$educqual_Father[cohort$Father=="Parent2"]=cohort$EDUCATION_QUALIFICATION2[cohort$Father=="Parent2"]
# cohort$educqual_Father[cohort$Father=="MAIN_PARENT"]=cohort$EDUCATION_QUALIFICATION_Main[cohort$Father=="MAIN_PARENT"]
# 
# cohort$educqual_Top=NA
# cohort$educqual_Top[cohort$TopParent=="Parent1"]=cohort$EDUCATION_QUALIFICATION1[cohort$TopParent=="Parent1"]
# cohort$educqual_Top[cohort$TopParent=="Parent2"]=cohort$EDUCATION_QUALIFICATION2[cohort$TopParent=="Parent2"]
# cohort$educqual_Top[cohort$TopParent=="MAIN_PARENT"]=cohort$EDUCATION_QUALIFICATION_Main[cohort$TopParent=="MAIN_PARENT"]
# 
# cohort$SchoolingBins_Father <- NA
# cohort$SchoolingBins_Father[which(cohort$schooling_Father<12)] <- "<12"
# cohort$SchoolingBins_Father[which(cohort$schooling_Father==12)] <- "12"
# cohort$SchoolingBins_Father[which(cohort$schooling_Father>12 & cohort$schooling_Father<=15)] <- "12-15"
# cohort$SchoolingBins_Father[which(cohort$schooling_Father==16)] <- "16"
# cohort$SchoolingBins_Father[which(cohort$schooling_Father>16 & cohort$schooling_Father<99)] <- ">16"
# 
# cohort$schooling_Top=NA
# cohort$schooling_Top[cohort$TopParent=="Parent1"]=cohort$YEARS_OF_SCHOOLING_PARENT1[cohort$TopParent=="Parent1"]
# cohort$schooling_Top[cohort$TopParent=="Parent2"]=cohort$YEARS_OF_SCHOOLING_PARENT2[cohort$TopParent=="Parent2"]
# cohort$schooling_Top[cohort$TopParent=="MAIN_PARENT"]=cohort$YEARS_OF_SCHOOLING_MainParent[cohort$TopParent=="MAIN_PARENT"]
# 
# cohort$SchoolingBins_Top <- NA
# cohort$SchoolingBins_Top[which(cohort$schooling_Top<12)] <- "<12"
# cohort$SchoolingBins_Top[which(cohort$schooling_Top==12)] <- "12"
# cohort$SchoolingBins_Top[which(cohort$schooling_Top>12 & cohort$schooling_Top<=15)] <- "12-15"
# cohort$SchoolingBins_Top[which(cohort$schooling_Top==16)] <- "16"
# cohort$SchoolingBins_Top[which(cohort$schooling_Top>16 & cohort$schooling_Top<99)] <- ">16"


# Skill 
# cohort$SKILL_LEVEL_CD11_Father= NA
# cohort$SKILL_LEVEL_CD11_Father[cohort$Father=="Parent1"]=cohort$SKILL_LEVEL_CD11_Parent1[cohort$Father=="Parent1"]
# cohort$SKILL_LEVEL_CD11_Father[cohort$Father=="Parent2"]=cohort$SKILL_LEVEL_CD11_Parent2[cohort$Father=="Parent2"]
# cohort$SKILL_LEVEL_CD11_Father[cohort$Father=="MAIN_PARENT"]=cohort$SKILL_LEVEL_CD11_Main[cohort$Father=="MAIN_PARENT"]
# 
# cohort$NOC2_CD11_Father= NA
# cohort$NOC2_CD11_Father[cohort$Father=="Parent1"]=cohort$NOC2_CD11_Parent1[cohort$Father=="Parent1"]
# cohort$NOC2_CD11_Father[cohort$Father=="Parent2"]=cohort$NOC2_CD11_Parent2[cohort$Father=="Parent2"]
# cohort$NOC2_CD11_Father[cohort$Father=="MAIN_PARENT"]=cohort$NOC2_CD11_Main[cohort$Father=="MAIN_PARENT"]
# 
# cohort$NOC3_CD11_Father= NA
# cohort$NOC3_CD11_Father[cohort$Father=="Parent1"]=cohort$NOC3_CD11_Parent1[cohort$Father=="Parent1"]
# cohort$NOC3_CD11_Father[cohort$Father=="Parent2"]=cohort$NOC3_CD11_Parent2[cohort$Father=="Parent2"]
# cohort$NOC3_CD11_Father[cohort$Father=="MAIN_PARENT"]=cohort$NOC3_CD11_Main[cohort$Father=="MAIN_PARENT"]
# 
# 
# cohort$SKILL_LEVEL_CD11_Top= NA
# cohort$SKILL_LEVEL_CD11_Top[cohort$TopParent=="Parent1"]=cohort$SKILL_LEVEL_CD11_Parent1[cohort$TopParent=="Parent1"]
# cohort$SKILL_LEVEL_CD11_Top[cohort$TopParent=="Parent2"]=cohort$SKILL_LEVEL_CD11_Parent2[cohort$TopParent=="Parent2"]
# cohort$SKILL_LEVEL_CD11_Top[cohort$TopParent=="MAIN_PARENT"]=cohort$SKILL_LEVEL_CD11_Main[cohort$TopParent=="MAIN_PARENT"]
# 
# cohort$NOC2_CD11_Top= NA
# cohort$NOC2_CD11_Top[cohort$TopParent=="Parent1"]=cohort$NOC2_CD11_Parent1[cohort$TopParent=="Parent1"]
# cohort$NOC2_CD11_Top[cohort$TopParent=="Parent2"]=cohort$NOC2_CD11_Parent2[cohort$TopParent=="Parent2"]
# cohort$NOC2_CD11_Top[cohort$TopParent=="MAIN_PARENT"]=cohort$NOC2_CD11_Main[cohort$TopParent=="MAIN_PARENT"]
# 
# cohort$NOC3_CD11_Top= NA
# cohort$NOC3_CD11_Top[cohort$TopParent=="Parent1"]=cohort$NOC3_CD11_Parent1[cohort$TopParent=="Parent1"]
# cohort$NOC3_CD11_Top[cohort$TopParent=="Parent2"]=cohort$NOC3_CD11_Parent2[cohort$TopParent=="Parent2"]
# cohort$NOC3_CD11_Top[cohort$TopParent=="MAIN_PARENT"]=cohort$NOC3_CD11_Main[cohort$TopParent=="MAIN_PARENT"]
# 
# 
# cohort$IntendedOccupation_Main[cohort$SKILL_LEVEL_CD11_Main=="0"]="Managerial/Professional"
# cohort$IntendedOccupation_Main[cohort$SKILL_LEVEL_CD11_Main=="A"]="Managerial/Professional"
# cohort$IntendedOccupation_Main[cohort$SKILL_LEVEL_CD11_Main=="B"]="Skilled and Technical"
# cohort$IntendedOccupation_Main[cohort$SKILL_LEVEL_CD11_Main=="C"]="Clerical and Laborers"
# cohort$IntendedOccupation_Main[cohort$SKILL_LEVEL_CD11_Main=="D"]="Clerical and Laborers"
# cohort$IntendedOccupation_Main[cohort$SKILL_LEVEL_CD11_Main=="N"]="New Workers"
# cohort$IntendedOccupation_Main[cohort$SKILL_LEVEL_CD11_Main %in% c("O","R","S")]="Non-Workers"
# 
# cohort$IntendedOccupation_Father[cohort$SKILL_LEVEL_CD11_Father=="0"]="Managerial/Professional"
# cohort$IntendedOccupation_Father[cohort$SKILL_LEVEL_CD11_Father=="A"]="Managerial/Professional"
# cohort$IntendedOccupation_Father[cohort$SKILL_LEVEL_CD11_Father=="B"]="Skilled and Technical"
# cohort$IntendedOccupation_Father[cohort$SKILL_LEVEL_CD11_Father=="C"]="Clerical and Laborers"
# cohort$IntendedOccupation_Father[cohort$SKILL_LEVEL_CD11_Father=="D"]="Clerical and Laborers"
# cohort$IntendedOccupation_Father[cohort$SKILL_LEVEL_CD11_Father=="N"]="New Workers"
# cohort$IntendedOccupation_Father[cohort$SKILL_LEVEL_CD11_Father %in% c("O","R","S")]="Non-Workers"
# 
# cohort$IntendedOccupation_Top[cohort$SKILL_LEVEL_CD11_Top=="0"]="Managerial/Professional"
# cohort$IntendedOccupation_Top[cohort$SKILL_LEVEL_CD11_Top=="A"]="Managerial/Professional"
# cohort$IntendedOccupation_Top[cohort$SKILL_LEVEL_CD11_Top=="B"]="Skilled and Technical"
# cohort$IntendedOccupation_Top[cohort$SKILL_LEVEL_CD11_Top=="C"]="Clerical and Laborers"
# cohort$IntendedOccupation_Top[cohort$SKILL_LEVEL_CD11_Top=="D"]="Clerical and Laborers"
# cohort$IntendedOccupation_Top[cohort$SKILL_LEVEL_CD11_Top=="N"]="New Workers"
# cohort$IntendedOccupation_Top[cohort$SKILL_LEVEL_CD11_Top %in% c("O","R","S")]="Non-Workers"
# 




################################################### clean
library(dplyr)
################## country where CHILD was born in:
cohort=merge(x=cohort, y=landing[,c("IMDB_ID","COUNTRY_BIRTH")], by="IMDB_ID")
cohort$birthcountry_child=cohort$COUNTRY_BIRTH; cohort$COUNTRY_BIRTH=NULL

# imposes common support 
cohort$refugeein=ifelse(cohort$ImmigrationCategory=="Refugee",1,0)
countrycount=cohort %>% group_by(birthcountry_child,refugeein) %>% summarize(count=n()) %>% ungroup()

# keep with groups of more than 30 people
countrycountsum=countrycount %>% filter (count>30) %>% group_by(birthcountry_child) %>% summarize(count=n())



#indicator if more than 30 people 
countrykeep=countrycountsum$birthcountry_child[countrycountsum$count>1]

cohort$birthcountrydrop=ifelse(cohort$birthcountry_child %in% countrykeep,0,1)


########### cma tab 
# CMA tabulations
dfcmatab=cohort %>% group_by(DESTINATION_CMA, refugeein) %>% summarize(count=n()) %>% ungroup()

cmacountsum= dfcmatab %>% filter(count>30) %>% group_by(DESTINATION_CMA) %>% summarize(count=n())
keepcma=cmacountsum$DESTINATION_CMA[cmacountsum$count>1]

cohort$cmadrop=ifelse(cohort$DESTINATION_CMA %in% keepcma,0,1)

# Refugee type 
# get list of imdb_ids that came under private sponsor refugee
privaterefugee_ids=landing$IMDB_ID[landing$IMMIGRATION_CATEGORY_CENSUS %in% c("C3220")]
blendedrefugee_ids=landing$IMDB_ID[landing$IMMIGRATION_CATEGORY_CENSUS %in% c("C3230")]

# 
# cohort$refugeetype=NA
# cohort$refugeetype[cohort$IMDB_ID %in% privaterefugee_ids]="Private"
# cohort$refugeetype[cohort$IMDB_ID %in% blendedrefugee_ids]="Blended"
# cohort$refugeetype[cohort$ImmigrationCategory=="Refugee" & is.na(cohort$refugeetype)]="Public"

# destination province: manage qc
cohort=merge(cohort,landing[,c("IMDB_ID","DESTINATION_PROVINCE")], by="IMDB_ID")

fwrite(cohort,"H:/Zheng_10223/Joint/cohort2025_connolly.csv")
############################## Connolly bin scatter ######### 
